from plotnine import *
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import math
from numpy.linalg import norm
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
sns.set()  # apply seaborn's default plot styling globally
warnings.filterwarnings("ignore")  # silence library warnings (notebook convenience)
def multiple_histogram(data):
    """Plot a grid of histograms (3 per row), one per column of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe whose every column gets its own histogram subplot.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled subplot figure.
    """
    num_row = math.ceil(len(data.columns) / 3)
    # BUG FIX: the original used the *global* `df.columns` for the subplot
    # titles, which broke the function for any dataframe other than the
    # global one; use the argument's columns instead.
    fig = make_subplots(rows=num_row, cols=3, subplot_titles=list(data.columns))
    place_col = 1
    place_row = 1
    for col in data.columns:
        fig.add_trace(go.Histogram(x=data[col]), row=place_row, col=place_col)
        place_col += 1
        if place_col == 4:  # wrap to the next row after three plots
            place_col = 1
            place_row += 1
    return fig.update_layout(height=1600, width=1100,
                             title_text="Multiple Histogram for all features")
def multiple_boxplot(data):
    """Plot a grid of box plots (3 per row), one per column of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataframe whose every column gets its own box-plot subplot.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled subplot figure.
    """
    num_row = math.ceil(len(data.columns) / 3)
    # BUG FIX: the original used the *global* `df.columns` for the subplot
    # titles; use the argument's columns so the function works on any input.
    fig = make_subplots(rows=num_row, cols=3, subplot_titles=list(data.columns))
    place_col = 1
    place_row = 1
    for col in data.columns:
        fig.add_trace(go.Box(y=data[col]), row=place_row, col=place_col)
        place_col += 1
        if place_col == 4:  # wrap to the next row after three plots
            place_col = 1
            place_row += 1
    # FIX: title previously said "Multiple Histogram ... featrues" (copy-paste).
    return fig.update_layout(height=1600, width=1100,
                             title_text="Multiple Boxplot for all features")
def check_nulls(X_data):
    """Summarize missing data per column.

    Parameters
    ----------
    X_data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, sorted by `usual_nulls` descending, with:
        - usual_nulls:   count of NaN values per column
        - strange_nulls: count of placeholder "null-like" values
          ('{}', '[]', '?', '.', '-', '_', empty/blank strings)
    """
    # BUG FIX: the original did `X_data[X_data.isin(...)].sum()`, which sums
    # the placeholder *values* themselves (concatenating strings / adding
    # numbers) rather than counting how many there are. Summing the boolean
    # mask yields the intended counts.
    unusual_nulls = X_data.isin(['{}','[]', "?", ".", "-", "_", "", " ", " "]).sum()
    nulls_df = pd.concat([X_data.isna().sum(), unusual_nulls], axis=1)
    nulls_df.columns = ["usual_nulls", "strange_nulls"]
    return nulls_df.sort_values('usual_nulls', ascending=False)
########################################################
def kMeansInitCentroids(X, K):
    """Initialize K centroids by averaging random samples of X.

    Each centroid is the mean of ``X.shape[0] // K`` rows drawn uniformly
    at random (with replacement) from X.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    K : int, number of centroids to produce

    Returns
    -------
    ndarray of shape (K, n_features)
    """
    n_samples = X.shape[0]
    sample_size = n_samples // K
    centroids = np.zeros((K, X.shape[1]))
    for k in range(K):
        picked = np.random.randint(n_samples, size=sample_size)
        # Mean of the sampled rows becomes the k-th initial centroid.
        centroids[k, :] = X[picked].sum(axis=0) / sample_size
    return centroids
def findClosestCentroids(X, centroids):
    """Return, for each row of X, the index of the nearest centroid.

    Distances are Euclidean.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    centroids : ndarray of shape (K, n_features)

    Returns
    -------
    ndarray of shape (n_samples, 1) of centroid indices
    """
    # (n, 1, d) - (K, d) broadcasts to (n, K, d); taking the norm over the
    # last axis yields the (n, K) distance matrix in one vectorized step.
    distances = np.linalg.norm(X[:, np.newaxis, :] - centroids, axis=2)
    return np.argmin(distances, axis=1).reshape(-1, 1)
def computeCentroids(X, idx, K, centroids_old):
    """Recompute each centroid as the mean of its assigned points.

    A cluster with no members keeps its previous centroid, so the result
    never contains NaN rows.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    idx : ndarray of assignments, reshaped internally to (n_samples,)
    K : int, number of clusters
    centroids_old : ndarray of shape (K, n_features), fallback centroids

    Returns
    -------
    ndarray of shape (K, n_features)
    """
    assignments = idx.reshape(X.shape[0])
    new_centroids = np.zeros((K, X.shape[1]))
    for k in range(K):
        members = X[assignments == k]
        # Empty cluster: retain the old centroid instead of averaging nothing.
        new_centroids[k] = centroids_old[k] if len(members) == 0 else members.mean(axis=0)
    return new_centroids
def k_means(X,K):
    """Run K-means with 100 random restarts and return the best result.

    For each restart: initialize centroids, run 300 fixed Lloyd iterations
    (no early-stopping/convergence check), compute the SSE (inertia), and
    finally return the restart with the lowest SSE.

    Returns (best_centroids, best_idx, min_sse).

    NOTE(review): after the final `computeCentroids` call, `idx` still
    corresponds to the centroids *before* the last update — confirm this
    off-by-one is acceptable.
    """
    X_data = pd.DataFrame(X)
    list_of_idx=[]
    list_of_centroids=[]
    list_of_sse=[]
    for j in range(100):
        centroids = kMeansInitCentroids(X, K)
        inertia = 0  # SSE accumulator, reset for each restart
        for i in range(300):
            idx = findClosestCentroids(X, centroids)
            centroids = computeCentroids(X, idx, K, centroids)
        for i in range(K):
            # NOTE(review): `idx` has shape (n, 1), so `idx==i` is a 2-D
            # boolean mask applied to a DataFrame — this mixed
            # DataFrame/ndarray indexing is fragile across pandas versions;
            # verify it selects whole rows as intended.
            if len(X_data[idx==i]) == 0:
                continue
            else:
                diff= X_data[idx==i]-centroids[i]
                # Sum of squared Euclidean distances of members to centroid i.
                inertia += sum(np.square(norm(diff, axis=1)))
        list_of_idx.append(idx)
        list_of_centroids.append(centroids)
        list_of_sse.append(inertia)
    # Pick the restart with the smallest SSE.
    min_sse_index = np.argmin(list_of_sse)
    return list_of_centroids[min_sse_index], list_of_idx[min_sse_index], min(list_of_sse)
def elbow(X, K):
    """Elbow-method plot: inertia of k_means for every k in 1..K.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
    K : int, largest number of clusters to evaluate

    Returns
    -------
    plotly.graph_objects.Figure
        Line plot of inertia versus number of centroids.
    """
    # `px` is already imported at module level; the original's local
    # `import plotly.express as px` was redundant and has been removed.
    variances = []
    for k in range(1, K + 1):
        _, _, variance = k_means(X, k)
        variances.append(variance)
    # Typos fixed in the axis labels and title ("cetriods", "agiven").
    return px.line(x=np.arange(1, K + 1), y=variances, markers=True,
                   labels={'x': "number of centroids", 'y': "Inertia"},
                   range_y=[0, max(variances) + 200],
                   title="Sum of variances VS number of centroids in a given data set")
def predict(centroids,X):
    """Assign every row of X to its nearest centroid (Euclidean distance).

    Parameters
    ----------
    centroids : ndarray of shape (K, n_features)
    X : ndarray of shape (n_samples, n_features)

    Returns
    -------
    ndarray of shape (n_samples, 1) of cluster labels
    """
    dist_matrix = np.zeros((X.shape[0], centroids.shape[0]))
    # One column of the distance matrix per centroid.
    for c_idx, centroid in enumerate(centroids):
        dist_matrix[:, c_idx] = np.linalg.norm(X - centroid, axis=1)
    return np.argmin(dist_matrix, axis=1).reshape(-1, 1)
# Read the credit-card customer dataset and drop the identifier column,
# which carries no clustering signal.
df = pd.read_csv('CC GENERAL.csv')
df.drop('CUST_ID', axis=1, inplace=True)
print('The shape of the dataset is:', df.shape)
The shape of the dataset is: (8950, 17)
# Preview the first five rows.
df.head()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
# Let's see the data types and non-null values for each column
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8950 entries, 0 to 8949 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 BALANCE 8950 non-null float64 1 BALANCE_FREQUENCY 8950 non-null float64 2 PURCHASES 8950 non-null float64 3 ONEOFF_PURCHASES 8950 non-null float64 4 INSTALLMENTS_PURCHASES 8950 non-null float64 5 CASH_ADVANCE 8950 non-null float64 6 PURCHASES_FREQUENCY 8950 non-null float64 7 ONEOFF_PURCHASES_FREQUENCY 8950 non-null float64 8 PURCHASES_INSTALLMENTS_FREQUENCY 8950 non-null float64 9 CASH_ADVANCE_FREQUENCY 8950 non-null float64 10 CASH_ADVANCE_TRX 8950 non-null int64 11 PURCHASES_TRX 8950 non-null int64 12 CREDIT_LIMIT 8949 non-null float64 13 PAYMENTS 8950 non-null float64 14 MINIMUM_PAYMENTS 8637 non-null float64 15 PRC_FULL_PAYMENT 8950 non-null float64 16 TENURE 8950 non-null int64 dtypes: float64(14), int64(3) memory usage: 1.2 MB
# Per-column summary of NaNs and placeholder "null-like" values.
check_nulls(df)
| usual_nulls | strange_nulls | |
|---|---|---|
| MINIMUM_PAYMENTS | 313 | 0.0 |
| CREDIT_LIMIT | 1 | 0.0 |
| BALANCE | 0 | 0.0 |
| CASH_ADVANCE_FREQUENCY | 0 | 0.0 |
| PRC_FULL_PAYMENT | 0 | 0.0 |
| PAYMENTS | 0 | 0.0 |
| PURCHASES_TRX | 0 | 0.0 |
| CASH_ADVANCE_TRX | 0 | 0.0 |
| PURCHASES_INSTALLMENTS_FREQUENCY | 0 | 0.0 |
| BALANCE_FREQUENCY | 0 | 0.0 |
| ONEOFF_PURCHASES_FREQUENCY | 0 | 0.0 |
| PURCHASES_FREQUENCY | 0 | 0.0 |
| CASH_ADVANCE | 0 | 0.0 |
| INSTALLMENTS_PURCHASES | 0 | 0.0 |
| ONEOFF_PURCHASES | 0 | 0.0 |
| PURCHASES | 0 | 0.0 |
| TENURE | 0 | 0.0 |
# This will print basic statistics for numerical columns
df.describe()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8949.000000 | 8950.000000 | 8637.000000 | 8950.000000 | 8950.000000 |
| mean | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.449450 | 1733.143852 | 864.206542 | 0.153715 | 11.517318 |
| std | 2081.531879 | 0.236904 | 2136.634782 | 1659.887917 | 904.338115 | 2097.163877 | 0.401371 | 0.298336 | 0.397448 | 0.200121 | 6.824647 | 24.857649 | 3638.815725 | 2895.063757 | 2372.446607 | 0.292499 | 1.338331 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.281915 | 0.888889 | 39.635000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.276166 | 169.123707 | 0.000000 | 12.000000 |
| 50% | 873.385231 | 1.000000 | 361.280000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 856.901546 | 312.343947 | 0.000000 | 12.000000 |
| 75% | 2054.140036 | 1.000000 | 1110.130000 | 577.405000 | 468.637500 | 1113.821139 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.134317 | 825.485459 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
# Distribution and outlier overview for every feature.
multiple_histogram(df)
multiple_boxplot(df)
# Correlation heatmap across all features.
plt.figure(figsize = (14,8))
sns.heatmap(df.corr(),annot=True,linewidths=0.1, cmap= 'RdBu');
#make a copy for the original dataset
df_copy=df.copy()
# Missing-value strategy: drop the affected rows (MINIMUM_PAYMENTS and
# CREDIT_LIMIT are the only columns with nulls).
df_copy.dropna(axis =0, inplace =True )
df_copy.isnull().sum()  # confirm no nulls remain
BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 0 PAYMENTS 0 MINIMUM_PAYMENTS 0 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
# log1p transform to reduce the heavy right skew visible in the histograms
# (log1p handles the zero values present in most spending columns).
df_logged = np.log1p(df_copy)
df_logged.describe()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 | 8636.000000 |
| mean | 6.265737 | 0.632000 | 4.928905 | 3.239500 | 3.387883 | 3.349135 | 0.365169 | 0.161321 | 0.273244 | 0.115484 | 0.829327 | 1.916439 | 8.099572 | 6.814890 | 5.922564 | 0.122011 | 2.521363 |
| std | 1.895982 | 0.126199 | 2.922819 | 3.252619 | 3.091009 | 3.571114 | 0.277233 | 0.217894 | 0.282259 | 0.157941 | 1.015146 | 1.378707 | 0.822341 | 1.159994 | 1.190068 | 0.214214 | 0.127535 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.931826 | 0.048326 | 0.018982 | 0.000000 | 1.945910 |
| 25% | 5.004584 | 0.646627 | 3.792507 | 0.000000 | 0.000000 | 0.000000 | 0.080042 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.693147 | 7.378384 | 6.039205 | 5.136760 | 0.000000 | 2.564949 |
| 50% | 6.822040 | 0.693147 | 5.930666 | 3.828533 | 4.562106 | 0.000000 | 0.405465 | 0.080042 | 0.154151 | 0.000000 | 0.000000 | 2.079442 | 8.006701 | 6.799809 | 5.747647 | 0.000000 | 2.564949 |
| 75% | 7.652639 | 0.693147 | 7.044888 | 6.397096 | 6.184453 | 7.032964 | 0.650588 | 0.287682 | 0.559616 | 0.223144 | 1.609438 | 2.944439 | 8.779711 | 7.576683 | 6.717196 | 0.154151 | 2.564949 |
| max | 9.854515 | 0.693147 | 10.800403 | 10.615512 | 10.021315 | 10.760839 | 0.693147 | 0.693147 | 0.693147 | 0.916291 | 4.820282 | 5.883322 | 10.308986 | 10.834125 | 11.243832 | 0.693147 | 2.564949 |
# Standardize the log-transformed features (zero mean, unit variance).
# NOTE(review): StandardScaler is already imported at the top of the file;
# this re-import is redundant but harmless.
from sklearn.preprocessing import StandardScaler
scal = StandardScaler()
df_scaled = scal.fit_transform(df_logged)
# 2-D t-SNE embedding of the scaled data, used later purely to visualize
# cluster assignments.
from sklearn.manifold import TSNE
tsne_projection = TSNE(n_components=2,
                       perplexity=30,
                       learning_rate=1,
                       init='pca',
                       method='barnes_hut',
                       n_jobs=-1,
                       n_iter=10**4,
                       random_state=42).fit_transform(df_scaled)
# Full PCA to inspect cumulative explained variance per component count.
from sklearn.decomposition import PCA
pca_scaled = PCA()
df_scaled_red = pca_scaled.fit_transform(df_scaled)
d = {'Number of EigenVector': np.arange(1,len(df_copy.columns)+1), 'Varaiability Captured': np.cumsum(pca_scaled.explained_variance_ratio_)}
# NOTE(review): this rebinds `df` (the raw dataset) to the PCA-variance
# table; any later code that still expects the original `df` would break.
df = pd.DataFrame(data=d)
px.line(df, x='Number of EigenVector', y='Varaiability Captured',markers=True,
        title='EigenVectors VS Varaiability Captured')
#### KMeans + scaled data + PCA-reduced features
# NOTE(review): the original heading said "7 Components" but the code keeps
# 10 principal components — confirm which was intended.
pca_scaled = PCA(n_components=10)
df_scaled_red = pca_scaled.fit_transform(df_scaled)
inertia = []
K = 20 #number of cluster
silhouette_scores =[]
# Fit KMeans for k = 2..K, recording inertia (elbow method) and the
# silhouette score for each k.
for i in range(2,K+1):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(df_scaled_red)
    inertia.append(kmeans.inertia_)
    labels = kmeans.labels_
    score = silhouette_score(df_scaled_red, labels)
    silhouette_scores.append(score)
# Elbow plot: inertia vs. number of clusters.
px.line(x=np.arange(2,K+1), y=inertia, markers=True,labels ={'x':"number of cetriods", 'y':"Inertia"},
        range_y=[0, max(inertia)+200] ,title="Sum of variances VS number of Cenriods in agiven data set")
#### KMeans: silhouette score vs. number of clusters
# BUG FIX: the original passed `range_y=[0, silhouette_scores]` — the whole
# list as the axis upper bound. `range_y` must be a [min, max] pair, so the
# maximum score is used instead. Label/title typos also fixed.
px.line(x=np.arange(2, K+1), y=silhouette_scores, markers=True,
        labels={'x': "number of clusters", 'y': "silhouette_scores"},
        range_y=[0, max(silhouette_scores)],
        title="silhouette_scores VS number of clusters in a given data set")
#### KMeans (k = 7) + scaled data + PCA-reduced features
# NOTE(review): the original heading said "logged Data"; the model is
# actually fit on the scaled, PCA-reduced matrix `df_scaled_red`.
kmeans = KMeans(n_clusters=7, random_state=42).fit(df_scaled_red)
# Attach the cluster labels to the 2-D t-SNE embedding and plot it.
tsne_projection = pd.DataFrame(tsne_projection, columns=['feature1', 'feature2'],index=df_copy.index.tolist())
tsne_projection['cluster'] = kmeans.labels_
px.scatter(tsne_projection, x='feature1', y='feature2', color="cluster")
#### AgglomerativeClustering + scaled data + PCA-reduced features
from sklearn.cluster import AgglomerativeClustering
# Compare the four linkage strategies across k = 2..K via silhouette score.
linkage = ['ward', 'complete', 'average', 'single']
K = 20 #number of cluster
silhouette_score_dict={}
for link in linkage:
    scores=[]
    for i in range(2,K+1):
        clustering = AgglomerativeClustering(n_clusters=i, linkage=link).fit(df_scaled_red)
        labels = clustering.labels_
        score = silhouette_score(df_scaled_red, labels)
        scores.append(score)
    silhouette_score_dict[link] = scores
# One line per linkage strategy, indexed by number of clusters.
silhouette_score_dataframe = pd.DataFrame(silhouette_score_dict,index=np.arange(2,K+1))
px.line(silhouette_score_dataframe)
#### Final agglomerative model: k = 7, ward linkage
clustering = AgglomerativeClustering(n_clusters=7, linkage='ward').fit(df_scaled_red)
# NOTE(review): tsne_projection is already a DataFrame at this point (it was
# converted above); pd.DataFrame(..., columns=[...]) then selects just the
# two embedding columns, dropping the previous 'cluster' column.
tsne_projection = pd.DataFrame(tsne_projection, columns=['feature1', 'feature2'],index=df_copy.index.tolist())
tsne_projection['cluster'] = clustering.labels_
px.scatter(tsne_projection, x='feature1', y='feature2', color="cluster")
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
# we ask for mins + 1 nearest, because the data point itself (distance = 0) is included
mins = 5
# FIX: `n_neighbors` must be passed by keyword — recent scikit-learn
# versions make estimator constructor parameters keyword-only, so the
# original positional call `NearestNeighbors(mins + 1)` raises TypeError.
knn = NearestNeighbors(n_neighbors=mins + 1)
knn.fit(df_scaled_red)
distances, neighbors = knn.kneighbors(df_scaled_red)
# Sort each point's distance to its mins-th neighbor; the elbow of this
# curve suggests a good eps for DBSCAN.
distances = np.sort(distances[:, mins], axis = 0)
distances
array([0.13219808, 0.15933425, 0.15945544, ..., 3.92373999, 4.07551868,
4.24326215])
#plot the distances
distances_df = pd.DataFrame({"distances": distances,
                             "index": list(range(0, len(distances)))})
# BUG FIX: the original bound this ggplot figure to the name `plt`,
# shadowing matplotlib.pyplot imported at the top of the file; use a
# distinct name so later matplotlib calls keep working.
eps_elbow_plot = (ggplot(distances_df, aes(x = "index", y = "distances")) +
    geom_line(color = "white", size = 2) + theme_minimal() +
    labs(title = "Elbow Method for Choosing eps") +
    theme(panel_grid_minor = element_blank(),
          rect = element_rect(fill = "#202124ff"),
          axis_text = element_text(color = "white"),
          axis_title = element_text(color = "white"),
          plot_title = element_text(color = "white"),
          panel_border = element_line(color = "darkgray"),
          plot_background = element_rect(fill = "#202124ff")
    ))
# Save the styled plot to disk, then display it.
ggsave(plot=eps_elbow_plot, filename='elbow.png', dpi=300)
eps_elbow_plot
<ggplot: (149324301529)>
# Fit DBSCAN on the full scaled data.
# NOTE(review): eps was chosen from the k-distance elbow computed on
# df_scaled_red (the PCA-reduced space), but DBSCAN is fit on df_scaled —
# confirm this space mismatch is intentional.
db1 = DBSCAN(eps = 2, min_samples = 5).fit(df_scaled)
labels = db1.labels_
# Silhouette score plus cluster size counts (label -1 = noise points).
silhouette_score(df_scaled, labels),pd.DataFrame(labels).value_counts()
(0.11012133070228931, 0 8295 -1 315 3 8 1 7 2 6 4 5 dtype: int64)
# NOTE(review): this refits the exact same DBSCAN model as the previous
# cell — redundant (DBSCAN is deterministic), but the labels are identical.
db1 = DBSCAN(eps = 2, min_samples = 5).fit(df_scaled)
# Visualize the DBSCAN labels on the t-SNE embedding.
tsne_projection = pd.DataFrame(tsne_projection, columns=['feature1', 'feature2'],index=df_copy.index.tolist())
tsne_projection['cluster'] = db1.labels_
px.scatter(tsne_projection, x='feature1', y='feature2', color="cluster")